Adapting methods described in the poster uploaded on Google Drive, I first calculated the mean pre-pubertal fE2 concentration. Because infants/juvs have elevated fE2 when nursing, we need to exclude these individuals when determining what typical pre-pubertal fE2 looks like.
Per Slack communication on 7/17: defining prebuteral E from
individuals greater than 2.5 years (def not nursing) and under 3 (def
not pubertal)
Per Zoom meeting on 7/22: defining prepubertal E from individuals between 3 and 4 years old.
## calculating average pre-pubertal E2
prepubertal_data<-e_data |>
filter(age_at_sample >= 3.0 & age_at_sample <4) |> # we want age 3..def pre menstruation
select(ind_id, age_at_sample, e_conc_ug)
prepubertal_e_data <- prepubertal_data |>
summarise(mean_e_conc_ug = mean(e_conc_ug, na.rm = TRUE),
sd_e_conc_ug = sd(e_conc_ug, na.rm = TRUE))
mean_prepubertal_e_conc_ug <- prepubertal_e_data$mean_e_conc_ug
sd_prepubertal_e_conc_ug <- prepubertal_e_data$sd_e_conc_ug
Using the mean + 2SD approach:
calculated_treshold <- mean_prepubertal_e_conc_ug + 2*sd_prepubertal_e_conc_ug
print(paste("Calculated fE2 threshold for menarche: ", calculated_treshold, " ug/g"))
## [1] "Calculated fE2 threshold for menarche: 2.84527656823452 ug/g"
Looking at the average age at which the population has an E reading that crosses the threshold E. Restricting dataset to individuals who we started sampling pre menstruation (before age 3.75); only looking for
# Identify the first age where e_conc_ug rises above the threshold for each individual
first_above_threshold <- e_data %>%
group_by(ind_id) %>%
filter(min(age_at_sample) <= 3.75) |> # want to make sure we have samples starting before menstruation to assign cutoff age
ungroup() |>
filter(age_at_sample>3.75) |> # only considering potentially pubescent if older than 4
filter(e_conc_ug > calculated_treshold) %>%
group_by(ind_id) %>%
summarise(first_age_above_threshold = min(age_at_sample, na.rm = TRUE)) %>%
ungroup()
# Calculate the mean and standard error of the ages
mean_age_above_threshold <- mean(first_above_threshold$first_age_above_threshold, na.rm = TRUE)
se_age_above_threshold <- sd(first_above_threshold$first_age_above_threshold, na.rm = TRUE) / sqrt(nrow(first_above_threshold))
print(paste("Average age at which e_conc_ug rises above the threshold: ", mean_age_above_threshold))
## [1] "Average age at which e_conc_ug rises above the threshold: 4.4375"
print(paste("Standard error of the average age: ", se_age_above_threshold))
## [1] "Standard error of the average age: 0.175038261123512"
For interpretability, this plot just has the data sorted into age-years, rather than exact age estimations with lots of decimals
e_data_for_menarche <- e_data %>%
filter(age_at_sample<8) |>
mutate(age_group = cut(age_at_sample, breaks = seq(0, max(age_at_sample, na.rm = TRUE) + 1, by = 1), right = FALSE))
## get sample size ----
n_figure_1 <- nrow(e_data_for_menarche)
##Figure 1
par(mfrow = c(1,1))
# Calculating mean and standard error for each age group
age_summary <- e_data_for_menarche %>%
group_by(age_group) %>%
summarise(
mean_e_at_age_years = mean(e_conc_ug, na.rm = TRUE),
se_e_at_age_years = sd(e_conc_ug, na.rm = TRUE) / sqrt(n())
) %>%
ungroup()
print("Grouping ages into years")
## [1] "Grouping ages into years"
ggplot(data = age_summary, aes(x = as.factor(age_group), y = mean_e_at_age_years)) +
geom_smooth() +
geom_point() +
geom_vline(xintercept = mean_age_above_threshold, linetype = "dashed", color = "red") +
geom_hline(yintercept = calculated_treshold, linetype = "dashed", color = "skyblue") +
geom_errorbar(aes(ymin = mean_e_at_age_years - se_e_at_age_years, ymax = mean_e_at_age_years + se_e_at_age_years), width = 0.2) +
geom_rect(aes(xmin = mean_age_above_threshold - 1.96 * se_age_above_threshold, xmax = mean_age_above_threshold + 1.96 * se_age_above_threshold,
ymin = -Inf, ymax = Inf), alpha = 0.05, fill = "pink") +
ggtitle(paste0("Average estradiol by age at sample (N = ", n_figure_1, " samples)")) +
xlab("Age at sample (years)") +
ylab("Fecal estradiol (ug/g)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
print("Putting all data points and applying a smoothing function")
## [1] "Putting all data points and applying a smoothing function"
ggplot(data = e_data_for_menarche, aes(x = age_at_sample, y = e_conc_ug)) +
geom_smooth()+
geom_point()+
geom_vline(xintercept = mean_age_above_threshold, linetype = "dashed", color = "red") +
geom_hline(yintercept = calculated_treshold, linetype = "dashed", color = "skyblue") +
# geom_errorbar(aes(ymin = mean_e_at_age_years - se_e_at_age_years, ymax = mean_e_at_age_years + se_e_at_age_years), width = 0.2) +
geom_rect(aes(xmin = mean_age_above_threshold - 1.96 * se_age_above_threshold, xmax = mean_age_above_threshold + 1.96 * se_age_above_threshold,
ymin = -Inf, ymax = Inf), alpha = 0.01, fill = "pink") +
ggtitle(paste0("Average estradiol by age at sample (N = ", n_figure_1, " samples)")) +
xlab("Age at sample (years)") +
ylab("Fecal estradiol (ug/g)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
print("Showing all datapoints with box plots for age-year groupings and smoothing")
## [1] "Showing all datapoints with box plots for age-year groupings and smoothing"
ggplot(data = e_data_for_menarche, aes(x = as.factor(floor(age_at_sample+1)), y = e_conc_ug)) +
geom_point(data = e_data_for_menarche, aes(x = age_at_sample, y = e_conc_ug),
alpha = 0.2, color = "green") +
geom_boxplot() +
geom_smooth(data = age_summary, aes(x = as.numeric(age_group), y = mean_e_at_age_years), method = "loess", color = "blue") +
geom_vline(xintercept = mean_age_above_threshold, linetype = "dashed", color = "red") +
geom_hline(yintercept = calculated_treshold, linetype = "dashed", color = "skyblue") +
geom_rect(aes(xmin = mean_age_above_threshold - 1.96 * se_age_above_threshold, xmax = mean_age_above_threshold + 1.96 * se_age_above_threshold,
ymin = -Inf, ymax = Inf), alpha = 0.01, fill = "pink") +
ggtitle(paste0("Estradiol levels by age at sample (N = ", n_figure_1, " samples)")) +
xlab("Age at sample (years)") +
ylab("Fecal estradiol (ug/g)") +
scale_x_discrete(labels = c("0-1", "1-2", "2-3", "3-4", "4-5", "5-6", "6-7", "7-8", "8-9", "9-10"))
## `geom_smooth()` using formula = 'y ~ x'
# ggplot(data = e_data |>
# filter(age_at_sample<10) |>
# select(age_at_sample_years, mean_e_at_age_years, se_e_at_age_years) |>
# distinct(), aes(x = age_at_sample_years, y = mean_e_at_age_years)) +
# geom_smooth() +
# geom_point()+
# geom_vline(xintercept = mean_age_above_threshold, linetype = "dashed", color = "red") +
# geom_hline(yintercept = calculated_treshold, linetype = "dashed", color = "skyblue") +
# geom_errorbar(aes(ymin = mean_e_at_age_years - se_e_at_age_years, ymax = mean_e_at_age_years + se_e_at_age_years), width = 0.2) +
# geom_rect(aes(xmin = mean_age_above_threshold - 1.96*se_age_at_menarche, xmax = mean_age_above_threshold + 1.96*se_age_at_menarche,
# ymin = -Inf, ymax = Inf), alpha = 0.05, fill = "pink") +
# ggtitle(paste0("Average estradiol by age at sample (N = ", n_figure_1, " samples)"))+
# xlab("Age at sample")+
# ylab("Fecal estradiol (ug/g)")
Interpreting the graph:
The black circles and error bars represent the average age fE2 reading (with standard error) for each age-year for the population
The blue line represents the results of a Loess regression, a non-parametric approach to modeling the relationship between x and y. The gray shading around the blue line is the error of the estimate.
The dashed light blue line is the calculated prepubertal fE threshold (y = 2.845277 ug/g)
The dashed red line is the calculated average age at puberty (x = 4.641429 years)
The shaded area around the dashed red line is the 95% CI for the average age at puberty estimate (3.22 ± 1.96* SE)
NOTE: I haven’t yet calculated individual E thresholds/ages at menarche because I think we are going to want a different approach than mean + 2sd….still working on that.
I am pulling data on females that we have good E coverage for before and after potential menarche…I’m looking for IDs with samples starting at least by age 3. At least 25 samples per ID. That leaves us with the following:
par(mfrow = c(1,1))
# Panel of all individual females who have good E coverage before and after menarche
females_for_menarche_panel<-e_data |>
group_by(ind_id) |>
filter(min(age_at_sample)<3) |>
mutate(count = n()) |>
ungroup() |>
select(ind_id,count) |>
arrange(desc(count)) |>
distinct() |>
filter(count>25) |>
pull(ind_id)
females_for_menarche_panel_df<- e_data |>
filter(ind_id %in% females_for_menarche_panel) |>
filter(age_at_sample < 8) |>
select(ind_id, age_at_sample, sample_id, date, month, time, hour, e_conc_ug, is_pregnant) |>
arrange(ind_id, age_at_sample) |>
mutate(date = as.Date(date))
n_figure_2 <-nrow(females_for_menarche_panel_df)
# facet wrap plots with geom_smooth
ggplot(data = females_for_menarche_panel_df, aes(x = age_at_sample, y = e_conc_ug, color = ind_id)) +
geom_point()+
facet_wrap(~ ind_id)+
geom_smooth()+
ggtitle(paste0("Estradiol by age at sample, separated by individual (n = ", n_figure_2, " samples)"))+
xlab("Age at sample")+
ylab("Fecal estradiol (ug/g)")+
theme(legend.position = "none")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Loop through each individual and create a ggplot for each
for (individual in females_for_menarche_panel) {
individual_data <- females_for_menarche_panel_df |>
filter(ind_id == individual)
# Get the age at which estradiol first exceeds the threshold for this individual
first_above_threshold_age_by_id <- first_above_threshold |>
filter(ind_id == individual) |>
pull(first_age_above_threshold)
print(paste0("Age at menarche for ", individual,": ", first_above_threshold_age_by_id))
# Get duckface dates
duckface_ages <- duckface_data %>%
filter(ind_id == individual) %>%
pull(age_at_sample)
# Create the plot
p <- ggplot(data = individual_data, aes(x = age_at_sample, y = e_conc_ug)) +
geom_point() +
geom_line(color = "blue") +
geom_vline(xintercept = first_above_threshold_age_by_id,
linetype = "dashed", color = "red") +
geom_vline(xintercept = as.numeric(duckface_ages), linetype = "solid", color = "purple") + # Add vertical lines for duckface dates
ggtitle(paste0("Estradiol by age at sample for individual ", individual, " (n = ", nrow(individual_data), " samples)")) +
xlab("Age at sample") +
ylab("Fecal estradiol (ug/g)") +
theme(legend.position = "none")
print(p)
}
## [1] "Age at menarche for TER: 4.73"
## [1] "Age at menarche for TAB: 4.19"
## [1] "Age at menarche for MIL: 4.44"
## [1] "Age at menarche for TAN: 4.07"
## [1] "Age at menarche for MAY: 3.97"
## [1] "Age at menarche for MON: 5.11"
## [1] "Age at menarche for MIS: 3.88"
## [1] "Age at menarche for MUS: 5.11"
shapiro.test(females_for_menarche_panel_df$e_conc_ug) #nope
##
## Shapiro-Wilk normality test
##
## data: females_for_menarche_panel_df$e_conc_ug
## W = 0.46254, p-value < 2.2e-16
shapiro.test(log(females_for_menarche_panel_df$e_conc_ug)) # log transformation does the trick!
##
## Shapiro-Wilk normality test
##
## data: log(females_for_menarche_panel_df$e_conc_ug)
## W = 0.99592, p-value = 0.1774
Log transformation does the trick! Data is now normal for our young female panel.
I am including individual ID as a random effect. In unadjusted model,
fixed effects are month and time of day (accounting for seasonality and
diurnal variation) with individual ID included as a random effect. Any
other things to control for?
The adjusted model includes age as a fixed effect. Age improves the
model fit!
unadjusted_e_model <- lmer(log(e_conc_ug) ~ + month + hour+ (1 | ind_id), data = females_for_menarche_panel_df)
adjusted_e_model <- lmer(log(e_conc_ug) ~ age_at_sample + month + hour+ (1 | ind_id), data = females_for_menarche_panel_df)
anova(unadjusted_e_model, adjusted_e_model) # adjusted_e_model fits better
## refitting model(s) with ML (instead of REML)
## Data: females_for_menarche_panel_df
## Models:
## unadjusted_e_model: log(e_conc_ug) ~ +month + hour + (1 | ind_id)
## adjusted_e_model: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## unadjusted_e_model 5 1879.4 1900.8 -934.70 1869.4
## adjusted_e_model 6 1751.7 1777.5 -869.86 1739.7 129.68 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(adjusted_e_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## Data: females_for_menarche_panel_df
##
## REML criterion at convergence: 1758.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.16124 -0.62234 0.00391 0.65935 2.72881
##
## Random effects:
## Groups Name Variance Std.Dev.
## ind_id (Intercept) 0.150 0.3874
## Residual 1.461 1.2087
## Number of obs: 537, groups: ind_id, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -2.78282 0.32894 -8.460
## age_at_sample 0.50977 0.04226 12.062
## month 0.08792 0.01492 5.894
## hour -0.04200 0.01934 -2.171
##
## Correlation of Fixed Effects:
## (Intr) ag_t_s month
## age_at_smpl -0.581
## month -0.257 0.022
## hour -0.616 -0.018 -0.031
#checking for multicollinearity -- low (~ 1)
vif(adjusted_e_model)
## age_at_sample month hour
## 1.000802 1.001467 1.001289
print("No multicollinearity -- yay!")
## [1] "No multicollinearity -- yay!"
Tldr: model passes diagnostics—yay
# checking model diagnostics
plot(adjusted_e_model) #resids vs leverage -- pretty good
lattice::qqmath(adjusted_e_model) ## qq norm plot -- good
plot(adjusted_e_model, # scale location plot -- pretty good
sqrt(abs(resid(.)))~fitted(.),
type=c("p","smooth"), col.line=1)
NOTE: I am holding off on reporting on this until we have the updated behavioral data containing group scan observations of duck face….planning to adapt methods from Sen et al. 2022
library(dplyr)
library(ggplot2)
library(zoo) # For rollapply
# Panel of all individual females who have good E coverage before and after menarche
females_for_cycling_panel<-e_data |>
group_by(ind_id) |>
filter(min(age_at_sample)>3) |>
mutate(count = n()) |>
ungroup() |>
select(ind_id,count) |>
arrange(desc(count)) |>
distinct() |>
filter(count>25) |>
pull(ind_id)
females_for_cycling_panel_df<- e_data |>
filter(ind_id %in% females_for_cycling_panel) |>
mutate(date = as.Date(date)) |>
filter(date > as.Date("2021-12-31")) |>
select(ind_id, age_at_sample, sample_id, date, month, time, hour, e_conc_ug, is_pregnant) |>
arrange(ind_id, age_at_sample)
n_figure_2 <-nrow(females_for_cycling_panel_df)
create_continuous_segments <- function(df) {
df %>%
arrange(date) %>%
mutate(segment = cumsum(is_pregnant == "P") + 1) %>%
group_by(segment) %>%
filter(any(is_pregnant == "NP")) %>%
ungroup() %>%
mutate(segment = cumsum(is_pregnant == "P") + 1) %>%
filter(is_pregnant == "NP")
}
# Function to find local peaks
find_peaks <- function(df) {
df %>%
arrange(date) %>%
mutate(peak = rollapply(e_conc_ug, width = 3, FUN = function(x) x[2] == max(x), fill = NA, align = 'center')) %>%
filter(peak == TRUE) %>%
select(ind_id, date, e_conc_ug, is_pregnant)
}
# Function to calculate cycle lengths
calculate_cycle_lengths <- function(df) {
df <- df |>
filter(is_pregnant == "NP")
peaks <- find_peaks(df)
peaks <- peaks %>%
arrange(date) %>%
mutate(next_date = lead(date)) %>%
mutate(cycle_length = as.numeric(difftime(next_date, date, units = "days"))) %>%
filter(!is.na(cycle_length))
# Filter to only include cycle lengths within 45 days
valid_cycles <- peaks %>%
filter(cycle_length <= 45)
# Calculate average cycle length
avg_cycle_length <- valid_cycles %>%
summarise(avg_cycle_length = mean(cycle_length, na.rm = TRUE))
return(avg_cycle_length)
}
# Loop through each individual and create a ggplot for each
for (individual in females_for_cycling_panel) {
individual_data <- females_for_cycling_panel_df %>%
filter(ind_id == individual)
# Calculate cycle lengths for this individual
cycle_lengths <- calculate_cycle_lengths(individual_data)
# Create continuous segments based on NP periods
segmented_data <- create_continuous_segments(individual_data)
p<-ggplot(data = individual_data, aes(x = date, y = e_conc_ug)) +
geom_point(aes(color = is_pregnant)) +
ggtitle(paste0("Estradiol by date for individual ", individual, " (n = ", nrow(individual_data), " samples)")) +
xlab("Date") +
ylab("Fecal estradiol (ug/g)") +
theme(legend.position = "none") +
geom_line(data = segmented_data, aes(group = segment), color = "blue") + # Plot lines for continuous NP segments
geom_vline(xintercept = as.Date(cycle_lengths$avg_cycle_length), linetype = "dashed", color = "green")
print(p)
# Print cycle lengths for this individual
print(paste0("Average cycle length for individual ", individual, ": ", cycle_lengths$avg_cycle_length, " days"))
}
## [1] "Average cycle length for individual TRU: 13.5 days"
## [1] "Average cycle length for individual TOT: 29 days"
## [1] "Average cycle length for individual TAM: 22.875 days"
## [1] "Average cycle length for individual TES: 18.5454545454545 days"
## [1] "Average cycle length for individual MUL: 18.8181818181818 days"
## [1] "Average cycle length for individual MEZ: 19.75 days"
## [1] "Average cycle length for individual TET: 24.7142857142857 days"
## [1] "Average cycle length for individual TEL: NaN days"
Purple lines = observed duck face
for (individual in females_for_cycling_panel) {
individual_data <- females_for_cycling_panel_df %>%
filter(ind_id == individual)
# Calculate cycle lengths for this individual
cycle_lengths <- calculate_cycle_lengths(individual_data)
# Create continuous segments based on NP periods
segmented_data <- create_continuous_segments(individual_data)
# Get duckface dates for this individual
duckface_dates <- duckface_data %>%
filter(ind_id == individual) %>%
pull(date)
p <- ggplot(data = individual_data, aes(x = date, y = e_conc_ug)) +
geom_point(aes(color = is_pregnant)) +
ggtitle(paste0("Estradiol by date for individual ", individual, " (n = ", nrow(individual_data), " samples)")) +
xlab("Date") +
ylab("Fecal estradiol (ug/g)") +
theme(legend.position = "none") +
geom_line(data = segmented_data, aes(group = segment), color = "blue") + # Plot lines for continuous NP segments
geom_vline(xintercept = as.numeric(duckface_dates), linetype = "solid", color = "purple") + # Add vertical lines for duckface dates
geom_vline(xintercept = as.Date(cycle_lengths$avg_cycle_length), linetype = "dashed", color = "green")
print(p)
# Print cycle lengths for this individual
print(paste0("Average cycle length for individual ", individual, ": ", cycle_lengths$avg_cycle_length, " days"))
}
## [1] "Average cycle length for individual TRU: 13.5 days"
## [1] "Average cycle length for individual TOT: 29 days"
## [1] "Average cycle length for individual TAM: 22.875 days"
## [1] "Average cycle length for individual TES: 18.5454545454545 days"
## [1] "Average cycle length for individual MUL: 18.8181818181818 days"
## [1] "Average cycle length for individual MEZ: 19.75 days"
## [1] "Average cycle length for individual TET: 24.7142857142857 days"
## [1] "Average cycle length for individual TEL: NaN days"
# Combine data from all individuals
all_peaks <- data.frame()
all_duckface_events <- data.frame()
for (individual in females_for_cycling_panel) {
individual_data <- females_for_cycling_panel_df %>%
filter(ind_id == individual)
# Find peaks for this individual
peaks <- find_peaks(individual_data)
all_peaks <- rbind(all_peaks, peaks)
# Get duckface dates for this individual
duckface_dates <- duckface_data %>%
filter(ind_id == individual) %>%
pull(date)
duckface_events <- data.frame(ind_id = individual, date = duckface_dates)
all_duckface_events <- rbind(all_duckface_events, duckface_events)
}
# Check for duckface events within ±4 days of peak estradiol dates
all_peaks <- all_peaks %>%
mutate(duckface_within_4_days = sapply(date, function(peak_date) {
any(abs(difftime(all_duckface_events$date[all_duckface_events$ind_id == ind_id], peak_date, units = "days")) <= 4)
}))
## Warning: There were 103 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `duckface_within_4_days = sapply(...)`.
## Caused by warning in `all_duckface_events$ind_id == ind_id`:
## ! longer object length is not a multiple of shorter object length
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 102 remaining warnings.
# Summarize results
summary_table <- all_peaks %>%
group_by(ind_id) %>%
summarise(
total_peaks = n(),
peaks_with_duckface = sum(duckface_within_4_days),
proportion_with_duckface = mean(duckface_within_4_days)
)
# Perform statistical test
# Create a contingency table
contingency_table <- table(all_peaks$duckface_within_4_days)
# Perform chi-squared test
chi_squared_test <- chisq.test(contingency_table)
# Print results
print(summary_table)
## # A tibble: 7 × 4
## ind_id total_peaks peaks_with_duckface proportion_with_duckface
## <chr> <int> <int> <dbl>
## 1 MEZ 18 1 0.0556
## 2 MUL 16 0 0
## 3 TAM 15 1 0.0667
## 4 TES 15 1 0.0667
## 5 TET 10 1 0.1
## 6 TOT 12 0 0
## 7 TRU 17 1 0.0588
print(chi_squared_test)
##
## Chi-squared test for given probabilities
##
## data: contingency_table
## X-squared = 83.971, df = 1, p-value < 2.2e-16
Could you please share the estimated/confirmed ages of all of our IDs? I can’t really assign this easily (unless I work backwards from age at sample info….). I know I had made a Google Sheet at some point with this information, but I no longer have access. Thanks!
e_data <- e_data %>%
mutate(across(c(B2,B3,B4), ~ na_if(.x, " ")))
e_data$B1 <- as.Date(e_data$B1)
e_data$B2 <- as.Date(e_data$B2)
e_data$B3 <- as.Date(e_data$B3)
e_data$B4 <- as.Date(e_data$B4)
e_data$date <- as.Date(e_data$date)
# # Calculate the difference in months
#
gestation_df_long <- e_data %>%
pivot_longer(cols = c(B1, B2, B3, B4), names_to = "birth_number", values_to = "birthing_date")
gestation_df <- gestation_df_long |>
mutate(months_from_birth = interval(date, birthing_date) / months(1)) |>
select(ind_id, age_at_sample, month, months_from_birth, e_conc_ug, birth_number)
gestating_df<- gestation_df |>
filter(months_from_birth >= 0 & months_from_birth <= 9) |>
mutate(months_from_birth = -1*months_from_birth)
gestating_df_females<-gestating_df |>
distinct(ind_id) |>
pull(ind_id)
not_gestating_df<-gestation_df |>
filter(is.na(months_from_birth) | months_from_birth > 9) |>
filter(age_at_sample > 6) |>
group_by(ind_id) |>
summarise(non_gestating_e2 = mean(e_conc_ug))
# Plot the data for each individual
for (individual in gestating_df_females) {
individual_data <- gestating_df %>%
filter(ind_id == individual)
# Establish baseline E2 for the individual
baseline_e <- not_gestating_df |>
filter(ind_id == individual) |>
pull(non_gestating_e2)
p <- ggplot(data = individual_data, aes(x = months_from_birth, y = e_conc_ug, color = birth_number, linetype = birth_number)) +
geom_point() +
geom_line() +
geom_hline(yintercept = baseline_e, linetype = "dashed", color = "green") +
ggtitle(paste0("Estradiol and gestation for ", individual, " (n = ", nrow(individual_data), " samples)")) +
xlab("Months from birth") +
ylab("Fecal estradiol (ug/g)") +
ylim(0, 150) +
xlim(-9, 0) +
theme(legend.position = "right") +
labs(color = "Birth Number", linetype = "Birth Number")
print(p)
}
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
QUESTION: I see some individuals are marekd as
pregnant (e..g, is_pregnant = “P”), but the
preg_trim column is blank. What do I do about this?
For now, I’m excluding pregnant individuals with missing trimester info.
pregnant_df<-e_data |>
select(ind_id, month, hour, age_at_sample, e_conc_ug, preg_trim, pregnancy_num, is_pregnant) |>
filter(is_pregnant == "P") |>
filter(preg_trim !="")
n_pregnant_df <- nrow(pregnant_df)
par(mfrow = c(1,1))
ggplot(data = pregnant_df, aes(x = preg_trim, y = e_conc_ug))+
geom_boxplot()+
geom_jitter(alpha = 0.1)+
ggtitle(paste("Estradiol and gestation (N = ", n_pregnant_df, "samples)")) +
xlab("Pregnancy trimester")+
ylab("Fecal estradiol (ug/g)")
Data is normal following log transformation
shapiro.test(pregnant_df$e_conc_ug)
##
## Shapiro-Wilk normality test
##
## data: pregnant_df$e_conc_ug
## W = 0.33877, p-value < 2.2e-16
shapiro.test(log(pregnant_df$e_conc_ug))
##
## Shapiro-Wilk normality test
##
## data: log(pregnant_df$e_conc_ug)
## W = 0.99539, p-value = 0.6763
Among trimesters, we only see that T2 is significantly lower than T3; no significant difference between T1-T2 or T1-T3.
anova_result_pregancy_e <- aov(log(e_conc_ug) ~ preg_trim, data = pregnant_df)
summary(anova_result_pregancy_e)
## Df Sum Sq Mean Sq F value Pr(>F)
## preg_trim 2 17.7 8.853 5.964 0.00296 **
## Residuals 242 359.2 1.484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tukey_results_pregnancy_e <- TukeyHSD(anova_result_pregancy_e)
print(tukey_results_pregnancy_e)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = log(e_conc_ug) ~ preg_trim, data = pregnant_df)
##
## $preg_trim
## diff lwr upr p adj
## T2-T1 -0.3732333 -0.8473855 0.1009188 0.1537996
## T3-T1 0.2486747 -0.2191212 0.7164707 0.4229352
## T3-T2 0.6219081 0.1957494 1.0480667 0.0019638
## what other control variables should I include?
unadjusted_pregnancy_model <- lmer(log(e_conc_ug) ~ age_at_sample + month+ hour+ (1 | ind_id), data = pregnant_df)
adjusted_pregnancy_model <- lmer(log(e_conc_ug) ~ age_at_sample + month + hour+preg_trim + (1 | ind_id), data = pregnant_df)
anova(unadjusted_pregnancy_model, adjusted_pregnancy_model) # adjusted model is better fit--pregnancy predicts E
## refitting model(s) with ML (instead of REML)
## Data: pregnant_df
## Models:
## unadjusted_pregnancy_model: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## adjusted_pregnancy_model: log(e_conc_ug) ~ age_at_sample + month + hour + preg_trim + (1 | ind_id)
## npar AIC BIC logLik deviance Chisq Df
## unadjusted_pregnancy_model 6 784.51 805.52 -386.25 772.51
## adjusted_pregnancy_model 8 775.48 803.49 -379.74 759.48 13.024 2
## Pr(>Chisq)
## unadjusted_pregnancy_model
## adjusted_pregnancy_model 0.001486 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(adjusted_pregnancy_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(e_conc_ug) ~ age_at_sample + month + hour + preg_trim + (1 |
## ind_id)
## Data: pregnant_df
##
## REML criterion at convergence: 781.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.74934 -0.62059 0.01613 0.67608 3.01420
##
## Random effects:
## Groups Name Variance Std.Dev.
## ind_id (Intercept) 0.4587 0.6773
## Residual 1.1646 1.0792
## Number of obs: 245, groups: ind_id, 23
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1.78746 0.52446 3.408
## age_at_sample 0.01562 0.02276 0.686
## month 0.02944 0.02363 1.246
## hour -0.07165 0.02894 -2.476
## preg_trimT2 -0.17724 0.20993 -0.844
## preg_trimT3 0.41154 0.19692 2.090
##
## Correlation of Fixed Effects:
## (Intr) ag_t_s month hour prg_T2
## age_at_smpl -0.538
## month -0.372 -0.015
## hour -0.655 0.008 0.030
## preg_trimT2 -0.293 -0.080 0.452 -0.019
## preg_trimT3 -0.311 -0.017 0.403 -0.019 0.660
Tldr: model passes diagnostics—yay
#checking for multicollinearity -- low (~ 1)
vif(adjusted_pregnancy_model)
## GVIF Df GVIF^(1/(2*Df))
## age_at_sample 1.009100 1 1.004540
## month 1.291221 1 1.136319
## hour 1.002459 1 1.001229
## preg_trim 1.301658 2 1.068130
# checking model diagnostics
plot(adjusted_pregnancy_model) #resids vs leverage -- good
lattice::qqmath(adjusted_pregnancy_model) ## qq norm plot -- good...small tail on right
plot(adjusted_pregnancy_model, # scale location plot -- good
sqrt(abs(resid(.)))~fitted(.),
type=c("p","smooth"), col.line=1)
Only looking at adults (ages >6)
lactation_df <- e_data |>
select(ind_id, month, hour, age_at_sample, e_conc_ug, lactating) |>
filter(age_at_sample > 6)
n_lactation_df<-nrow(lactation_df)
ggplot(data = lactation_df, aes(x = lactating, y = e_conc_ug))+
geom_boxplot()+
geom_jitter(alpha = 0.1)+
ggtitle(paste("Estradiol and lactation (N = ", n_pregnant_df, "samples)")) +
xlab("Lactating")+
ylab("Fecal estradiol (ug/g)")
Data is still not normal following log transformation
shapiro.test(lactation_df$e_conc_ug)
##
## Shapiro-Wilk normality test
##
## data: lactation_df$e_conc_ug
## W = 0.26856, p-value < 2.2e-16
shapiro.test(log(lactation_df$e_conc_ug))
##
## Shapiro-Wilk normality test
##
## data: log(lactation_df$e_conc_ug)
## W = 0.99144, p-value = 6.802e-05
Data is not normal, so can’t do ANOVA. Let’s use a nonparametric approach to see if E is significantly lower during lactation….it is (W = 98014, p < 0.001)
wilcox.test(log(e_conc_ug) ~ lactating, data = lactation_df)
##
## Wilcoxon rank sum test with continuity correction
##
## data: log(e_conc_ug) by lactating
## W = 98014, p-value = 5.847e-09
## alternative hypothesis: true location shift is not equal to 0
Technically, data should be normal for this. BUT I think our sample might be large enough that we’re okay violating this assumption?
## what other control variables should I include?
unadjusted_lactation_model <- lmer(log(e_conc_ug) ~ age_at_sample + month+ hour + (1 | ind_id), data = e_data)
adjusted_lactation_model <- lmer(log(e_conc_ug) ~ age_at_sample + month+ hour + lactating + (1 | ind_id), data = e_data)
anova(unadjusted_lactation_model,adjusted_lactation_model) # adjusted model is better fit--lactation predicts E
## refitting model(s) with ML (instead of REML)
## Data: e_data
## Models:
## unadjusted_lactation_model: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## adjusted_lactation_model: log(e_conc_ug) ~ age_at_sample + month + hour + lactating + (1 | ind_id)
## npar AIC BIC logLik deviance Chisq Df
## unadjusted_lactation_model 6 4843.2 4874.7 -2415.6 4831.2
## adjusted_lactation_model 7 4803.3 4840.0 -2394.6 4789.3 41.974 1
## Pr(>Chisq)
## unadjusted_lactation_model
## adjusted_lactation_model 9.25e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(adjusted_lactation_model) # lactation = lower E
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(e_conc_ug) ~ age_at_sample + month + hour + lactating + (1 |
## ind_id)
## Data: e_data
##
## REML criterion at convergence: 4813.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1108 -0.6274 -0.0233 0.6412 3.3238
##
## Random effects:
## Groups Name Variance Std.Dev.
## ind_id (Intercept) 2.147 1.465
## Residual 1.626 1.275
## Number of obs: 1403, groups: ind_id, 44
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -1.633063 0.349677 -4.670
## age_at_sample 0.173253 0.020625 8.400
## month 0.061315 0.009774 6.273
## hour -0.022011 0.012923 -1.703
## lactatingYES -0.635455 0.094448 -6.728
##
## Correlation of Fixed Effects:
## (Intr) ag_t_s month hour
## age_at_smpl -0.587
## month -0.188 0.038
## hour -0.426 0.007 0.032
## lactatngYES 0.058 -0.182 -0.047 0.040
Tldr: model passes diagnostics—yay
#checking for multicollinearity -- low (~ 1)
vif(adjusted_lactation_model)
## age_at_sample month hour lactating
## 1.035517 1.004192 1.002907 1.038010
# checking model diagnostics
plot(adjusted_lactation_model) #resids vs leverage -- good
lattice::qqmath(adjusted_lactation_model) ## qq norm plot -- good...small tail on right
plot(adjusted_lactation_model, # scale location plot -- good
sqrt(abs(resid(.)))~fitted(.),
type=c("p","smooth"), col.line=1)
Can I please have access to these data?